{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "ifmV02zKlsCs"
      },
      "outputs": [],
      "source": [
        "!pip install -qU \\\n",
        "  datasets==2.14.6 \\\n",
        "  voyageai==0.1.3"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q4vTJ-pFmWl5"
      },
      "source": [
        "## Dataset Download"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GFaaDw5VmZEk"
      },
      "source": [
        "We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4-FqcdKHmVpa",
        "outputId": "d69dd348-29f9-4748-bfce-551820f46d8e"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Dataset({\n",
              "    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],\n",
              "    num_rows: 41584\n",
              "})"
            ]
          },
          "metadata": {},
          "execution_count": 2
        }
      ],
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "data = load_dataset(\"jamescalam/ai-arxiv-chunked\", split=\"train\")\n",
        "data"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gp5a_bInyfdX"
      },
      "source": [
        "First we define our embedding function."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "oG6zd1dLw54w"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import voyageai\n",
        "\n",
        "voyageai.api_key = os.getenv(\"VOYAGE_API_KEY\") or \"YOUR_VOYAGE_API_KEY\"\n",
        "model_id = \"voyage-01\"\n",
        "\n",
        "def embed(docs: list[str]) -> list[list[float]]:\n",
        "    if len(docs) > 8:\n",
        "        raise ValueError(\"List of documents cannot be longer than 8\")\n",
        "    doc_embeds = voyageai.get_embeddings(docs, model=model_id)\n",
        "    return doc_embeds"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1nvrNQSGXvEC"
      },
      "source": [
        "Use this to build a Numpy array of cohere embedding vectors."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "id": "EdyWVR17zX7I",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "d2476f6811e748598e4b537723473326",
            "557cafac9f624e62b64e556a83dae01e",
            "4a86079fae934818ad8a8f9fc80f269a",
            "11afb771042947bb800e88c542909fff",
            "70c0b9ec72294e7a9aa33c87eba22547",
            "558d5abb111742bcb079f48f2f009851",
            "ae5d64fc9fe0461db91127cd6cb9bb0f",
            "63f9d536cece42558f45003f89edb68a",
            "0289dde60afa489f984ec52ba75aa172",
            "3b2e86e7c3304d2e9118eca2d1ddee78",
            "f9fabbef12cd4668bea38fe02afff650"
          ]
        },
        "outputId": "8e557397-efdb-4e9e-ad70-7cdeb7f5ac55"
      },
      "outputs": [
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "d2476f6811e748598e4b537723473326",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "  0%|          | 0/5198 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "from tqdm.auto import tqdm\n",
        "import numpy as np\n",
        "\n",
        "chunks = data[\"chunk\"]\n",
        "batch_size = 8\n",
        "\n",
        "for i in tqdm(range(0, len(chunks), batch_size)):\n",
        "    i_end = min(len(chunks), i+batch_size)\n",
        "    chunk_batch = chunks[i:i_end]\n",
        "    # embed current batch\n",
        "    embed_batch = embed(chunk_batch)\n",
        "    # add to existing np array if exists\n",
        "    if i == 0:\n",
        "        arr = embed_batch.copy()\n",
        "    else:\n",
        "        arr = np.concatenate(\n",
        "            [arr, embed_batch.copy()\n",
        "        ])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Bl9g3ePt029u"
      },
      "source": [
        "Now we need to create the query mechanism, this is simply a cosine similarity calculation between a query vector and our `arr` vectors."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "MR7WyDiatlsX"
      },
      "outputs": [],
      "source": [
        "from numpy.linalg import norm\n",
        "\n",
        "# convert chunks list to array for easy indexing\n",
        "chunk_arr = np.array(chunks)\n",
        "\n",
        "def query(text: str, top_k: int=3) -> list[str]:\n",
        "    # create query embedding\n",
        "    xq = np.array(embed([text])[0])\n",
        "    # calculate cosine similarities\n",
        "    sim = np.dot(arr, xq.T) / \\\n",
        "        (norm(arr, axis=1)*norm(xq.T))\n",
        "    # get indices of top_k records\n",
        "    idx = np.argpartition(sim, -top_k)[-top_k:]\n",
        "    docs = chunk_arr[idx]\n",
        "    for d in docs.tolist():\n",
        "        print(d)\n",
        "        print(\"----------\")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "len(embed([\"text\"])[0])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-6EzaPhcVaYw",
        "outputId": "4841332b-3a95-4059-f54d-afd9681890c6"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1024"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "u2NjYxsn7J5f",
        "outputId": "bd0242ee-31d8-476a-f08f-112dc808c4b6"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "models will be released as we improve model safety with community feedback.\n",
            "License A custom commercial license is available at: ai.meta.com/resources/\n",
            "models-and-libraries/llama-downloads/\n",
            "Where to send commentsInstructions on how to provide feedback or comments on the model can be\n",
            "found in the model README, or by opening an issue in the GitHub repository\n",
            "(https://github.com/facebookresearch/llama/ ).\n",
            "Intended Use\n",
            "Intended Use Cases L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle is intended for commercial and research use in English. Tuned models\n",
            "are intended for assistant-like chat, whereas pretrained models can be adapted\n",
            "for a variety of natural language generation tasks.\n",
            "Out-of-Scope Uses Use in any manner that violates applicable laws or regulations (including trade\n",
            "compliancelaws). UseinlanguagesotherthanEnglish. Useinanyotherway\n",
            "that is prohibited by the Acceptable Use Policy and Licensing Agreement for\n",
            "L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle.\n",
            "Hardware and Software (Section 2.2)\n",
            "Training Factors We usedcustomtraininglibraries, Meta’sResearchSuperCluster, andproductionclustersforpretraining. Fine-tuning,annotation,andevaluationwerealso\n",
            "----------\n",
            "asChatGPT,BARD,andClaude. TheseclosedproductLLMsareheavilyﬁne-tunedtoalignwithhuman\n",
            "preferences, which greatly enhances their usability and safety. This step can require signiﬁcant costs in\n",
            "computeandhumanannotation,andisoftennottransparentoreasilyreproducible,limitingprogresswithin\n",
            "the community to advance AI alignment research.\n",
            "In this work, we develop and release Llama 2, a family of pretrained and ﬁne-tuned LLMs, L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle and\n",
            "L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc , at scales up to 70B parameters. On the series of helpfulness and safety benchmarks we tested,\n",
            "L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc models generally perform better than existing open-source models. They also appear to\n",
            "be on par with some of the closed-source models, at least on the human evaluations we performed (see\n",
            "----------\n",
            "Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang\n",
            "Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang\n",
            "Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic\n",
            "Sergey Edunov Thomas Scialom\u0003\n",
            "GenAI, Meta\n",
            "Abstract\n",
            "In this work, we develop and release Llama 2, a collection of pretrained and ﬁne-tuned\n",
            "large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters.\n",
            "Our ﬁne-tuned LLMs, called L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc , are optimized for dialogue use cases. Our\n",
            "models outperform open-source chat models on most benchmarks we tested, and based on\n",
            "ourhumanevaluationsforhelpfulnessandsafety,maybeasuitablesubstituteforclosedsource models. We provide a detailed description of our approach to ﬁne-tuning and safety\n",
            "----------\n"
          ]
        }
      ],
      "source": [
        "query(\"why should I use llama 2?\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "ZSIBTMUy7qc0",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "677d4053-8009-4fab-af3d-672123d22f5e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Asian 3 2.6%\n",
            "Black or African American 10 8.7%\n",
            "Hispanic, Latino, or Spanish 1 0.9%\n",
            "Middle Eastern or North African 1 0.9%\n",
            "Native Hawaiian or Paciﬁc Islander 1 0.9%\n",
            "White or Caucasian 94 81.7%\n",
            "Prefer not to say 1 0.9%\n",
            "Other 2 1.7%\n",
            "Education\n",
            "High school or some college 40 34.8%\n",
            "College degree 62 53.9%\n",
            "Graduate or professional degree 12 10.4%\n",
            "Prefer not to say 0 0%\n",
            "Other 1 0.9%\n",
            "Disability\n",
            "Hearing difﬁculty 0 0%\n",
            "Vision difﬁculty 1 0.9%\n",
            "Cognitive difﬁculty 1 0.9%\n",
            "Ambulatory (mobility) difﬁculty 4 3%\n",
            "Self-care difﬁculty 1 0.9%\n",
            "Other 2 1.5%\n",
            "None 106 92%\n",
            "Figure 4 Results of a demographic survey completed by 115of324red team members.\n",
            "model that evaluates the inherent efﬁcacy of a red team member, which we plot in Figure 5 (Right). We\n",
            "ﬁnd that some workers are particularly effective at red teaming, whereas others are not. In Appendix A.3 we\n",
            "----------\n",
            "cyber); ﬁndingsonthesetopicsweremarginal andweremitigated. Nonetheless, wewill continueourred\n",
            "teaming eﬀorts in this front.\n",
            "Todate,allofourredteamingeﬀortshavetargetedmodeloutputsinEnglish,buthavecruciallyincluded\n",
            "non-Englishpromptsanddialoguecontexts,asthatisawell-knownattackvector. Inallexercises,participants\n",
            "were given risk category deﬁnitions and were shown just a handful of examples of risky interactions with an\n",
            "LLM.Afterthat,eachparticipantwaspartofasubteamfocusedonaparticularcategoryofriskorattack\n",
            "vector. Aftercreatingeachdialogue,theredteamparticipantwouldannotatevariousattributes,including\n",
            "risk areas and degree of risk, as captured by a 5-point Likert scale.\n",
            "Some examples of useful insights provided by members of red teams that we were able to improve upon\n",
            "throughout development:\n",
            "•[Early models] were more likely to have generated unsafe responses without noting that they contain problematiccontent. However, [slightly later models] have tended todisplay knowledge\n",
            "that the content is problematic, even if they do go on to provide it. “They respond with ‘[UNSAFE\n",
            "----------\n",
            "vague answers due to context distillation). We thus leverage the safety reward model to decide whether to\n",
            "use safety context distillation – we keep the context-distilled output only on the examples where it gets a\n",
            "betterrewardmodelscorethantheoriginalanswer. Wenoticethatthisisparticularlyhelpfulonprompts\n",
            "that the model is very bad at, but limits the negative impact of context distillation (see Figure 16b).\n",
            "4.3 Red Teaming\n",
            "GivenhowbroadthecapabilitiesofLLMsareandhowvariedtheirtrainingdatais,itisinsuﬃcienttoidentify\n",
            "risks solely via ex post facto usage and analysis. Rather, as has been done for other LLMs, we performed\n",
            "various kinds of proactive risk identiﬁcation, colloquially called “red teaming,“ based on the term commonly\n",
            "used within computer security. This kind of granular analysis is very important because safety is a long-tail\n",
            "issue,inwhichevenveryinfrequentedgecasescancausenoticeableproblems. Evenifquantitativescores\n",
            "report good results, these types of qualitative insights allow us to recognize and target speciﬁc patterns in a\n",
            "more comprehensive way.\n",
            "We conducted a series of red teaming with various groups of internal employees, contract workers, and\n",
            "----------\n"
          ]
        }
      ],
      "source": [
        "query(\"can you tell me about red teaming for llama 2?\")"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "query(\"what is the performance of llama 2?\")"
      ],
      "metadata": {
        "id": "0bHX57WoXr2g",
        "outputId": "392cbdea-10c4-4fd5-c1b9-1c0c40eedba0",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "the provided license and our Acceptable Use Policy , which prohibit any uses that would violate applicable\n",
            "policies, laws, rules, and regulations.\n",
            "Wealsoprovidecodeexamplestohelpdevelopersreplicateoursafegenerationswith L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc and\n",
            "applybasicsafetytechniquesattheuserinputandmodeloutputlayers. Thesecodesamplesareavailable\n",
            "here: https://github.com/facebookresearch/llama . Finally,wearesharinga ResponsibleUseGuide ,which\n",
            "provides guidelines regarding safe development and deployment.\n",
            "ResponsibleRelease. WhilemanycompanieshaveoptedtobuildAIbehindcloseddoors,wearereleasing\n",
            "L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle openly to encourage responsible AI innovation. Based on our experience, an open approach draws\n",
            "uponthecollectivewisdom,diversity,andingenuityoftheAI-practitionercommunitytorealizethebeneﬁtsof\n",
            "thistechnology. Collaborationwillmakethesemodelsbetterandsafer. TheentireAIcommunity—academic\n",
            "researchers, civil society, policymakers, and industry—must work together to rigorously analyze and expose\n",
            "----------\n",
            "models will be released as we improve model safety with community feedback.\n",
            "License A custom commercial license is available at: ai.meta.com/resources/\n",
            "models-and-libraries/llama-downloads/\n",
            "Where to send commentsInstructions on how to provide feedback or comments on the model can be\n",
            "found in the model README, or by opening an issue in the GitHub repository\n",
            "(https://github.com/facebookresearch/llama/ ).\n",
            "Intended Use\n",
            "Intended Use Cases L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle is intended for commercial and research use in English. Tuned models\n",
            "are intended for assistant-like chat, whereas pretrained models can be adapted\n",
            "for a variety of natural language generation tasks.\n",
            "Out-of-Scope Uses Use in any manner that violates applicable laws or regulations (including trade\n",
            "compliancelaws). UseinlanguagesotherthanEnglish. Useinanyotherway\n",
            "that is prohibited by the Acceptable Use Policy and Licensing Agreement for\n",
            "L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle.\n",
            "Hardware and Software (Section 2.2)\n",
            "Training Factors We usedcustomtraininglibraries, Meta’sResearchSuperCluster, andproductionclustersforpretraining. Fine-tuning,annotation,andevaluationwerealso\n",
            "----------\n",
            "our responsible release strategy can be found in Section 5.3.\n",
            "Theremainderofthispaperdescribesourpretrainingmethodology(Section2),ﬁne-tuningmethodology\n",
            "(Section 3), approach to model safety (Section 4), key observations and insights (Section 5), relevant related\n",
            "work (Section 6), and conclusions (Section 7).\n",
            "‡https://ai.meta.com/resources/models-and-libraries/llama/\n",
            "§We are delaying the release of the 34B model due to a lack of time to suﬃciently red team.\n",
            "¶https://ai.meta.com/llama\n",
            "‖https://github.com/facebookresearch/llama\n",
            "4\n",
            "Figure 4: Training of L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc : This process begins with the pretraining ofL/l.sc/a.sc/m.sc/a.sc /two.taboldstyle using publicly\n",
            "availableonlinesources. Followingthis,wecreateaninitialversionof L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc throughtheapplication\n",
            "----------\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "query(\"what is the best llm?\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2VUqwF-com3z",
        "outputId": "8f93ef8a-0b87-4f0c-ce85-0010131e6e27"
      },
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "EL- LEX 7 67.5 70.0 69.5 69.0 42.9 47.7 63.4 55.7 63.6 41.4 53.6 55.4 25.1 49.9 46.5 58.6 61.8 44.8 52.3 52.8\n",
            "MF1-LEX 3 63.3 63.4 61.7 62.8 41.0 58.9 53.6 50.1 64.7 40.3 54.8 53.9 31.3 49.8 33.5 38.9 67.8 42.4 47.3 46.0\n",
            "MF1-LEX 7 64.7 64.3 63.4 64.1 46.3 58.1 61.3 54.6 65.7 46.4 53.1 55.5 36.5 53.1 36.1 44.4 70.2 43.1 52.2 49.2\n",
            "MF10\n",
            "KM EANS-LEX 3 64.8 61.9 64.2 63.7 35.9 52.0 56.0 51.7 57.5 41.5 50.3 51.1 30.3 47.4 46.2 52.6 69.2 44.1 49.2 52.3\n",
            "MF10\n",
            "----------\n",
            "worse on multi-turn conversations, which could be due to its lack of multi-turn supervised ﬁne-tuning data.\n",
            "InFigure19,weshowtheper-categorysafetyviolationpercentageofdiﬀerentLLMs. Whilemodelperformanceissimilaracrosscategories, L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc hasrelativelymoreviolationsunderthe unqualiﬁedadvice\n",
            "category (although still low in an absolute sense), for various reasons, including lack of an appropriate\n",
            "disclaimer (e.g., “I am not a professional” ) at times. For the other two categories, L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc achieves\n",
            "comparable or lower violation percentage consistently regardless of model sizes.\n",
            "Truthfulness,Toxicity,andBias. In Table 14, ﬁne-tuned L/l.sc/a.sc/m.sc/a.sc /two.taboldstyle-C/h.sc/a.sc/t.sc shows great improvement over the\n",
            "----------\n",
            "2 Language model, LM 73.78 28 .53 89 .79 85 .23 78 .68 84 .22 84 .00 84 .88 88 .70 74 .94 75 .77 84 .84 58 .84 38 .97 17 .54 36 .37 53 .81 64 .55 56 .51 64 .22 59 .92 71 .43 64 .00 53 .04 1 .05 46 .81 45 .78 58 .84 56 .74 69 .23 25 .23 34 .31 25 .38\n",
            "2 Prefix LM, LM 79.68 41 .26 92 .09 90 .11 86 .27 86 .82 86 .32 88 .35 91 .35 81 .71 82 .02 89 .04 68 .59 39 .66 17 .84 37 .13 76 .87 85 .39 64 .86 71 .47 93 .37 91 .07 57 .00 58 .67 16 .89 59 .25 58 .16 64 .26 66 .30 71 .15 26 .28 37 .51 26 .76\n",
            "----------\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "query(\"what is the difference between gpt-4 and llama 2?\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "sF8xvtyaoom4",
        "outputId": "b058cc73-01cc-4045-a649-758cdd0756c3"
      },
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(ii)For GPT-4 results alone, the translated responses show superior performance over the generated\n",
            "response in Chinese, probably because GPT-4 is trained in richer English corpus than Chinese, which\n",
            "leads to stronger English instruction-following ability. In Figure 5 (c), we show results for all models\n",
            "who are asked to answer in Chinese.\n",
            "We compare LLaMA-GPT4 with GPT-4 and Alpaca unnatural instructions in Figure 6. In terms of the\n",
            "average ROUGE-L scores, Alpaca outperforms the other two models. We note that LLaMA-GPT4 and\n",
            "GPT4 is gradually performing better when the ground truth response length is increasing, eventually\n",
            "showing higher performance when the length is longer than 4. This means that they can better follow\n",
            "instructions when the scenarios are more creative. Across different subsets, LLaMA-GPT4 can\n",
            "7\n",
            "0-2 3-5 6-10 10>\n",
            "Groundtruth Response Length0.30.40.5RougeL\n",
            "-0.043\n",
            "-0.009+0.0132-0.004 +0.0562\n",
            "+0.0387-0.012\n",
            "----------\n",
            "to GPT-3 corresponds to the Stanford Alpaca model. From Figure 3(a), we observe that ( i) For the\n",
            "“Helpfulness” criterion, GPT-4 is the clear winner with 54.12% of the votes. GPT-3 only wins 19.74%\n",
            "of the time. ( ii) For the “Honesty” and “Harmlessness” criteria, the largest portion of votes goes\n",
            "to the tie category, which is substantially higher than the winning categories but GPT-3 (Alpaca) is\n",
            "slightly superior.\n",
            "Second, we compare GPT-4-instruction-tuned LLaMA models against the teacher model GPT-4 in\n",
            "Figure 3(b). The observations are quite consistent over the three criteria: GPT-4-instruction-tuned\n",
            "LLaMA performs similarly to the original GPT-4. We conclude that learning from GPT-4 generated\n",
            "5\n",
            "60% 70% 80% 90% 100%12345BRanking Group 94% 624 : 66792% 614 : 67091% 623 : 68289% 597 : 66989% 605 : 67891% 609 : 666\n",
            "----------\n",
            "GPT3-b01 42.7 45.2y41.0 41.4y37.1 39.1y32.0 33.4y35.0 35.2y\n",
            "GPT3-c01 41.3 40.8 44.6 45.1y38.9 39.5y31.6 33.2y36.1 45.1y\n",
            "GPT3-d01 40.0 40.1 46.6 47.5y40.5 41.0y32.4 34.3y36.0 33.9\n",
            "GPT3-d03 43.7 43.4 45.2 44.9 41.1 40.3 36.3 38.1y35.2 38.0y\n",
            "GPT2-M 36.0 39.2y34.6 35.3y28.1 30.7y28.3 28.3 41.8 43.3y\n",
            "GPT2-L 36.4 39.8y33.7 34.4y29.4 31.5y27.8 28.1y39.6 41.3y\n",
            "GPT2-XL 35.3 39.9y35.9 36.1y31.2 33.1y28.1 28.0 40.4 41.0y\n",
            "----------\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "---"
      ],
      "metadata": {
        "id": "d2UP-QJzo610"
      }
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "machine_shape": "hm",
      "provenance": [],
      "gpuType": "V100"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "d2476f6811e748598e4b537723473326": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_557cafac9f624e62b64e556a83dae01e",
              "IPY_MODEL_4a86079fae934818ad8a8f9fc80f269a",
              "IPY_MODEL_11afb771042947bb800e88c542909fff"
            ],
            "layout": "IPY_MODEL_70c0b9ec72294e7a9aa33c87eba22547"
          }
        },
        "557cafac9f624e62b64e556a83dae01e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_558d5abb111742bcb079f48f2f009851",
            "placeholder": "​",
            "style": "IPY_MODEL_ae5d64fc9fe0461db91127cd6cb9bb0f",
            "value": "100%"
          }
        },
        "4a86079fae934818ad8a8f9fc80f269a": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_63f9d536cece42558f45003f89edb68a",
            "max": 5198,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_0289dde60afa489f984ec52ba75aa172",
            "value": 5198
          }
        },
        "11afb771042947bb800e88c542909fff": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_3b2e86e7c3304d2e9118eca2d1ddee78",
            "placeholder": "​",
            "style": "IPY_MODEL_f9fabbef12cd4668bea38fe02afff650",
            "value": " 5198/5198 [1:41:38&lt;00:00,  1.22s/it]"
          }
        },
        "70c0b9ec72294e7a9aa33c87eba22547": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "558d5abb111742bcb079f48f2f009851": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ae5d64fc9fe0461db91127cd6cb9bb0f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "63f9d536cece42558f45003f89edb68a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "0289dde60afa489f984ec52ba75aa172": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "3b2e86e7c3304d2e9118eca2d1ddee78": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "f9fabbef12cd4668bea38fe02afff650": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}